This notebook defines the most focal recurrent copy number units by removing focal changes that are within entire chromosome arm losses and gains. Most focal here meaning:
This notebook is intended to be run from the command line with the following (assumes you are in the root directory of the repository):
Rscript -e "rmarkdown::render('analyses/focal-cn-file-preparation/05-define-most-focal-cn-units.Rmd', clean = TRUE)"
# The fraction of calls a particular status needs to be
# above to be called the majority status -- the decision
# for a cutoff of 90% here was made to ensure that the status
# is not only the majority status but it is also significantly
# called more than the other status values in the region
status_threshold <- 0.9
# The fraction threshold for determining if enough of a region
# (arm, cytoband, or gene) is callable to determine its status --
# the decision for a cutoff of 50% here was made as it seems reasonable
# to expect a region to be more than 50% callable for a dominant status
# call to be made
uncallable_threshold <- 0.5
library(tidyverse)
── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.0 ✔ purrr 0.3.2
✔ tibble 2.1.3 ✔ dplyr 0.8.3
✔ tidyr 0.8.3 ✔ stringr 1.4.0
✔ readr 1.3.1 ✔ forcats 0.4.0
── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
results_dir <- "results"
Read in cytoband status file and format it for what we will need in this notebook.
# Read in the file with consensus CN status data and the UCSC cytoband data --
# generated in `03-add-cytoband-status-consensus.Rmd`
consensus_seg_cytoband_status_df <-
read_tsv(file.path(results_dir, "consensus_seg_with_ucsc_cytoband_status.tsv.gz")) %>%
# Need this to not have `chr`
mutate(
chr = gsub("chr", "", chr),
cytoband = paste0(chr, cytoband)
) %>%
select(
chromosome_arm,
# Distinguish this dominant status that is based on cytobands, from the status
dominant_cytoband_status = dominant_status,
cytoband,
Kids_First_Biospecimen_ID,
band_length,
gain_fraction,
loss_fraction,
callable_fraction
)
Parsed with column specification:
cols(
Kids_First_Biospecimen_ID = col_character(),
chr = col_character(),
cytoband = col_character(),
dominant_status = col_character(),
band_length = col_double(),
callable_fraction = col_double(),
gain_fraction = col_double(),
loss_fraction = col_double(),
chromosome_arm = col_character()
)
Read in the gene-level data.
# Read in the annotated gene level CN file
consensus_seg_autosomes_df <-
read_tsv(file.path(results_dir, "consensus_seg_annotated_cn_autosomes.tsv.gz")) %>%
mutate(chromosome_arm = gsub("(p|q).*", "\\1", cytoband))
Parsed with column specification:
cols(
biospecimen_id = col_character(),
status = col_character(),
copy_number = col_double(),
ploidy = col_double(),
ensembl = col_character(),
gene_symbol = col_character(),
cytoband = col_character(),
pct_overlap = col_double()
)
# Rename "amplification" status calls to be "gain" for the purpose of this script
consensus_seg_autosomes_df$status <-
gsub("amplification", "gain", consensus_seg_autosomes_df$status)
consensus_seg_arm_status <- consensus_seg_cytoband_status_df %>%
# Group by biospecimen ID and region
group_by(Kids_First_Biospecimen_ID, chromosome_arm) %>%
# Summarize the weighted means for each status
summarize(
loss_fraction_arm = weighted.mean(loss_fraction, band_length),
gain_fraction_arm = weighted.mean(gain_fraction, band_length),
callable_fraction_arm = weighted.mean(callable_fraction, band_length)
) %>%
# Define dominant status based the weighted means meeting a status
# threshold
mutate(
dominant_arm_status = case_when(
callable_fraction_arm < (1 - uncallable_threshold) ~ "uncallable",
loss_fraction_arm > status_threshold ~ "loss",
gain_fraction_arm > status_threshold ~ "gain",
loss_fraction_arm + gain_fraction_arm > status_threshold ~ "unstable",
TRUE ~ "neutral"
)
)
# Display table
consensus_seg_arm_status
We want to include cytoband and gene-level calls for chromosome arms that have not been defined as a gain or loss to make the cytoband-level majority calls.
# Now define the cytoband as that status if more than the `status_threshold`
# fraction value of the total counts are for that particular status
consensus_seg_cytoband_status <-
consensus_seg_cytoband_status_df %>%
mutate(
dominant_cytoband_status = case_when(
callable_fraction < (1 - uncallable_threshold) ~ "uncallable",
loss_fraction > status_threshold ~ "loss",
gain_fraction > status_threshold ~ "gain",
loss_fraction + gain_fraction > status_threshold ~ "unstable",
TRUE ~ "neutral"
)
)
# Join the consensus seg arm status data and filter to include only neutral
# chromosome arms and non-neutral cytobands
filtered_consensus_cytoband_status <- consensus_seg_cytoband_status %>%
left_join(
consensus_seg_arm_status,
by = c(
"Kids_First_Biospecimen_ID",
"chromosome_arm"
)
) %>%
# Filter the annotated CN data to include only neutral chromosome arms and disagreements
filter(
dominant_arm_status %in% c("neutral", "uncallable", "unstable") |
(
dominant_cytoband_status != dominant_arm_status &
# bands that disagree with arm, but are not neutral (or uncallable)
!(dominant_cytoband_status %in% c("neutral", "uncallable", "unstable"))
)
) %>%
select(
Kids_First_Biospecimen_ID,
cytoband,
loss_fraction,
gain_fraction,
callable_fraction,
dominant_cytoband_status
)
# Display table
filtered_consensus_cytoband_status
# Create separate rows for genes span multiple cytobands
# CAUTION: This will require addition of a distinct() statment later to resolve duplicates
# Filtering to only multiband genes first because regexes are slow.
multi_band_genes <- consensus_seg_autosomes_df %>%
filter(grepl("-", cytoband)) %>%
extract(cytoband, into = c("chrom", "band"), regex = "([0-9]+)(.+)") %>% # make chrom and band cols
separate_rows(band, sep = "-") %>% # duplicate rows with more than one band
unite(cytoband, chrom, band, sep = "") # rejoin chrom and band
# Filter to singleband genes
single_band_genes <- consensus_seg_autosomes_df %>%
filter(!grepl("-", cytoband))
gene_df <- bind_rows(single_band_genes, multi_band_genes)
# Now create a data.frame with the gene-level status calls for the
# neutral, uncallable, and unstable cytoband-level and chromosome arm-level
# calls
filtered_consensus_gene_status <- gene_df %>%
left_join(
consensus_seg_arm_status,
by = c("biospecimen_id" = "Kids_First_Biospecimen_ID",
"chromosome_arm")
) %>%
left_join(
consensus_seg_cytoband_status,
by = c("biospecimen_id" = "Kids_First_Biospecimen_ID",
"cytoband")
) %>%
# Filter the annotated CN data to include only neutral arms and cytobands,
# and disagreements
filter(
# Case 1) Gene call disagrees with both arm and cytoband (This captures most
# we want to keep, including all of when arm and cytoband are neutral, uncallable
# or unstable, since we have no neutral gene calls in this df):
(
status != dominant_arm_status & status != dominant_cytoband_status
)
# Case 2) Gene call disagrees with a non-neutral cytoband call.
# Keep no matter what the arm status
| (
status != dominant_cytoband_status
& dominant_cytoband_status %in% c("gain", "loss")
)
# I think that captures everything we want. Cases we don't want include:
# gene & arm agree, but cytoband is neutral
# gene & cytoband agree
# all 3 agree
) %>%
select(Kids_First_Biospecimen_ID = biospecimen_id,
gene_symbol,
status) %>%
# The `distinct()` function is needed to remove duplicates resulting from
# the band separation into multiple rows step above
distinct()
# Display table
filtered_consensus_gene_status
# Rename each the dominant status columns of the data.frames
# to be uniform for the binding rows step and filter out "neutral" calls
consensus_seg_arm_status <- consensus_seg_arm_status %>%
filter(!(dominant_arm_status == "neutral")) %>%
rename(dominant_status = dominant_arm_status)
filtered_consensus_cytoband_status <- filtered_consensus_cytoband_status %>%
filter(!(dominant_cytoband_status == "neutral")) %>%
rename(dominant_status = dominant_cytoband_status)
# There are no "neutral" calls at the gene level so we do not need to filter
# out those calls here
filtered_consensus_gene_status <- filtered_consensus_gene_status %>%
rename(dominant_status = status)
# For each of the datasets we're joining, we'll only keep these columns:
cols_to_keep <- c("Kids_First_Biospecimen_ID", "dominant_status")
Combine into one long data frame
# Combine the arm, cytoband, and gene status count data
final_df <- bind_rows(
arm = select(consensus_seg_arm_status,
cols_to_keep,
region = chromosome_arm),
cytoband = select(filtered_consensus_cytoband_status,
cols_to_keep,
region = cytoband),
gene = select(filtered_consensus_gene_status,
cols_to_keep,
region = gene_symbol),
.id = "region_type"
) %>%
# Reorder columns more sensibly
select(Kids_First_Biospecimen_ID,
status = dominant_status,
region,
region_type) %>%
arrange(Kids_First_Biospecimen_ID)
# Print out preview
final_df
# Write final long status table to file
write_tsv(
final_df,
file.path(results_dir, "consensus_seg_most_focal_cn_status.tsv.gz")
)
# Display final long status table
final_df %>%
arrange(region_type)
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)
Matrix products: default
BLAS/LAPACK: /usr/lib/libopenblasp-r0.2.19.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3 purrr_0.3.2
[5] readr_1.3.1 tidyr_0.8.3 tibble_2.1.3 ggplot2_3.2.0
[9] tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] Rcpp_1.0.1 cellranger_1.1.0 pillar_1.4.2 compiler_3.6.0
[5] base64enc_0.1-3 tools_3.6.0 digest_0.6.20 lubridate_1.7.4
[9] jsonlite_1.6 evaluate_0.14 nlme_3.1-140 gtable_0.3.0
[13] lattice_0.20-38 pkgconfig_2.0.2 rlang_0.4.0 cli_1.1.0
[17] rstudioapi_0.10 yaml_2.2.0 haven_2.1.1 xfun_0.8
[21] withr_2.1.2 xml2_1.2.0 httr_1.4.0 knitr_1.23
[25] hms_0.4.2 generics_0.0.2 grid_3.6.0 tidyselect_0.2.5
[29] glue_1.3.1 R6_2.4.0 readxl_1.3.1 rmarkdown_1.13
[33] modelr_0.1.4 magrittr_1.5 backports_1.1.4 scales_1.0.0
[37] htmltools_0.3.6 rvest_0.3.4 assertthat_0.2.1 colorspace_1.4-1
[41] stringi_1.4.3 lazyeval_0.2.2 munsell_0.5.0 broom_0.5.2
[45] crayon_1.3.4